• Show All Code
  • Hide All Code

violation analysis

Restaurant group

2024-12-02

Analysis related to violation of Manhattan Restaurants

library(tidyverse)
library(plotly)
library(ggplot2)
library(knitr)
library(readr)
library(dplyr)
library(tidyr)
library(scales)

In this chapter, we analyze the relationship between inspection violations and grades. Our analysis focuses exclusively on data where the grades are A, B, or C, and excludes records with grades labeled as N (Not Yet Graded), Z (Grade Pending), or P (Grade Pending issued upon reopening after a closure). We also exclude “Not Applicable” entries for critical violation indicators.
Inspection violations are categorized into two groups: Critical and Not Critical. Critical violations represent the most significant risks for foodborne illnesses. To ensure accuracy and relevance, we begin by filtering out unnecessary data from the dataset before proceeding with the analysis.

# The same data cleaning process in demographics part, also only consider data with valid violation_description and critical_flag, and grade A, B, C.

manhattan_data = read_csv("Manhattan_Restaurant_Inspection_Results.csv", na = c("NA", "", "."))
str (manhattan_data)
## spc_tbl_ [94,616 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ CAMIS                : num [1:94616] 50140436 50158081 50152703 50160975 50161087 ...
##  $ DBA                  : chr [1:94616] "JUST SALAD" "THE MANNER" "MIDNIGHT BLUE" "BLUE BLOSSOM" ...
##  $ BORO                 : chr [1:94616] "Manhattan" "Manhattan" "Manhattan" "Manhattan" ...
##  $ BUILDING             : chr [1:94616] "2853" "58" "106" "108" ...
##  $ STREET               : chr [1:94616] "BROADWAY" "THOMPSON STREET" "EAST   19 STREET" "WEST   39 STREET" ...
##  $ ZIPCODE              : num [1:94616] 10025 10012 10003 10018 10025 ...
##  $ PHONE                : num [1:94616] 7.32e+09 9.17e+09 3.48e+09 6.47e+09 9.29e+09 ...
##  $ CUISINE DESCRIPTION  : chr [1:94616] NA NA NA NA ...
##  $ INSPECTION DATE      : chr [1:94616] "01/01/1900" "01/01/1900" "01/01/1900" "01/01/1900" ...
##  $ ACTION               : chr [1:94616] NA NA NA NA ...
##  $ VIOLATION CODE       : chr [1:94616] NA NA NA NA ...
##  $ VIOLATION DESCRIPTION: chr [1:94616] NA NA NA NA ...
##  $ CRITICAL FLAG        : chr [1:94616] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ SCORE                : num [1:94616] NA NA NA NA NA NA NA NA NA NA ...
##  $ GRADE                : chr [1:94616] NA NA NA NA ...
##  $ GRADE DATE           : chr [1:94616] NA NA NA NA ...
##  $ RECORD DATE          : chr [1:94616] "11/05/2024" "11/05/2024" "11/05/2024" "11/05/2024" ...
##  $ INSPECTION TYPE      : chr [1:94616] NA NA NA NA ...
##  $ Latitude             : num [1:94616] 40.8 40.7 40.7 40.8 40.8 ...
##  $ Longitude            : num [1:94616] -74 -74 -74 -74 -74 ...
##  $ Community Board      : num [1:94616] 109 102 105 105 107 108 101 105 104 102 ...
##  $ Council District     : chr [1:94616] "07" "01" "02" "04" ...
##  $ Census Tract         : chr [1:94616] "019900" "004700" "005000" "011300" ...
##  $ BIN                  : num [1:94616] 1075440 1087362 1017905 1015273 1055676 ...
##  $ BBL                  : num [1:94616] 1.02e+09 1.00e+09 1.01e+09 1.01e+09 1.02e+09 ...
##  $ NTA                  : chr [1:94616] "MN09" "MN24" "MN21" "MN17" ...
##  $ Location Point1      : logi [1:94616] NA NA NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   CAMIS = col_double(),
##   ..   DBA = col_character(),
##   ..   BORO = col_character(),
##   ..   BUILDING = col_character(),
##   ..   STREET = col_character(),
##   ..   ZIPCODE = col_double(),
##   ..   PHONE = col_double(),
##   ..   `CUISINE DESCRIPTION` = col_character(),
##   ..   `INSPECTION DATE` = col_character(),
##   ..   ACTION = col_character(),
##   ..   `VIOLATION CODE` = col_character(),
##   ..   `VIOLATION DESCRIPTION` = col_character(),
##   ..   `CRITICAL FLAG` = col_character(),
##   ..   SCORE = col_double(),
##   ..   GRADE = col_character(),
##   ..   `GRADE DATE` = col_character(),
##   ..   `RECORD DATE` = col_character(),
##   ..   `INSPECTION TYPE` = col_character(),
##   ..   Latitude = col_double(),
##   ..   Longitude = col_double(),
##   ..   `Community Board` = col_double(),
##   ..   `Council District` = col_character(),
##   ..   `Census Tract` = col_character(),
##   ..   BIN = col_double(),
##   ..   BBL = col_double(),
##   ..   NTA = col_character(),
##   ..   `Location Point1` = col_logical()
##   .. )
##  - attr(*, "problems")=<externalptr>

cleaned_data = manhattan_data %>%
  janitor::clean_names() %>%  
  filter(
    !is.na(dba),                         
    !is.na(cuisine_description),       
    !is.na(grade),                       
    !is.na(score),                       
    !is.na(zipcode),
    !is.na(violation_description),
    !is.na(critical_flag),
    grade %in% c("A", "B", "C")
  ) %>% mutate(region = case_when(
    zipcode >= 10000 & zipcode <= 10025 ~ "Downtown",
    zipcode >= 10026 & zipcode <= 10040 ~ "Midtown",
    zipcode >= 10041 & zipcode <= 10282 ~ "Uptown",
    TRUE ~ "Other" # For ZIP codes like 11371, 12345, etc.
  ))

First, we analyze the frequency distribution of grades. For both Critical and Not Critical groups, the majority of restaurants received a Grade A, followed by Grade B, and then Grade C. However, only within Grade A do we observe more restaurants with Not Critical violations compared to Critical violations.

# Analyze the effect of violation_description on grade
# Group by critical flag and grade to calculate counts
violation_grade_summary <- cleaned_data %>%
  group_by(critical_flag, grade) %>%
  summarize(count = n(), .groups = "drop")

# Calculate average scores by CRITICAL_FLAG and grade
average_score <- cleaned_data %>%
  group_by(critical_flag, grade) %>%
  summarize(avg_score = mean(score), .groups = "drop")


# Create an EDA of grade and count using plotly
violation_grade_summary %>% 
  plot_ly(
    x = ~grade, 
    y = ~count,
    type = 'bar',
    color = ~critical_flag,
    colors = "viridis") %>% 
  layout(
    title = "Effect of Violation Type on Restaurant Grade",
    xaxis = list(title = "Grade"),   
    yaxis = list(title = "Restaurant Number") 
  )

Additionally, when examining inspection scores across all grade categories, restaurants with Not Critical violations consistently achieve lower scores. This pattern aligns with the corresponding grade results, further highlighting the relationship between inspection outcomes and grade assignments.

# Create an EDA of score and count using plotly
average_score %>% 
  plot_ly(
    x = ~grade, 
    y = ~avg_score,
    type = 'bar',
    color = ~critical_flag,
    colors = "viridis") %>% 
  layout(
    title = "Effect of Violation Type on Restaurant Score",
    xaxis = list(title = "Grade"),   
    yaxis = list(title = "Restaurant Average Score") 
  )

Next, we conducted a Chi-Square Test to examine the relationship between the critical flag and grade. The results are visualized using a Contingency Table Heatmap. The analysis shows a p-value smaller than 0.05, leading us to reject the null hypothesis that the critical flag and grade are independent. This indicates a significant relationship between the critical flag and grade at the 0.05 significance level.

# Chi-Square Test
contingency_table <- table(cleaned_data$critical_flag, cleaned_data$grade)
chi_test <- chisq.test(contingency_table)
print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 821.92, df = 2, p-value < 2.2e-16


# Convert the contingency table to a matrix for visualization
contingency_matrix <- as.matrix(contingency_table)


# Extract chi-square test statistic and p-value
chi_stat <- round(chi_test$statistic, 2)
chi_pval <- format.pval(chi_test$p.value, digits = 3)

# Add the chi-square test result as annotations
plot_ly(
  x = colnames(contingency_matrix),
  y = rownames(contingency_matrix),
  z = contingency_matrix,
  type = "heatmap",
  colors = colorRamp(c("white", "purple"))
) %>%
  layout(
    title = "Contingency Table Heatmap with Chi-Square Result",
    xaxis = list(title = "Grade"),
    yaxis = list(title = "Critical Flag"),
    colorbar = list(title = "Count"),
    annotations = list(
      list(
        x = 2.1, 
        y = 1.1,
        text = paste("Chi-Square Statistic:", chi_stat, "<br>P-Value:", chi_pval),
        showarrow = FALSE,
        font = list(size = 14)
      )
    )
  )
## Warning: 'layout' objects don't have these attributes: 'colorbar'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

Then, we analyze the relationship between restaurant cuisine type and violation. To identify patterns, we calculate the percentage of critical violations for each cuisine type and rank them accordingly. Our analysis highlights the top 10 cuisine types with the highest percentage of violations, with Chinese/Japanese cuisine emerging as the category with the highest violation rate.
This finding provides valuable insights into how specific cuisine types are associated with inspection outcomes, offering a deeper understanding of potential factors influencing violation frequencies. Such insights could guide further investigation into operational practices or compliance challenges unique to these cuisine categories.

# Group by cuisine_description and calculate total and critical counts
violation_cuisine <- cleaned_data %>%
  mutate(critical_flag = ifelse(critical_flag == "Critical", 1, 0)) %>%
  group_by(cuisine_description) %>%
  summarize(total_violations = n(),
            critical_violations = sum(critical_flag, na.rm = TRUE),
            critical_percent = (critical_violations / total_violations),
            critical_percentage = percent(critical_percent, accuracy = 0.01)
            ) %>%
  arrange(desc(critical_percentage))  %>% # Sort by percentage of critical violations 
  head(10) # View top 10 restaurant types with the highest percentage of critical violations

plot_ly(
  type = 'table',
  header = list(
    values = c("Cuisine Description", "Critical Percentage"),
    align = c("center", "center"),
    font = list(size = 14, color = "white"),
    fill = list(color = "purple")
  ),
  cells = list(
    values = rbind(violation_cuisine$cuisine_description, violation_cuisine$critical_percentage),
    align = c("center", "center"),
    font = list(size = 12),
    fill = list(color = c("white", "white"))
  )
)

Among all restaurants with critical violations, we aimed to identify the most common types of violations. To achieve this, we calculated the percentage of each violation type and grouped all categories with less than 3% into an “Other” category for clarity. The results are summarized in a pie chart, providing a clear visualization of the distribution of critical violations.
The analysis reveals that the most frequent critical violation is related to improper temperature control of TCS (Time/Temperature Control for Safety) foods. Specifically, this includes cold TCS food items held above 41°F, smoked or processed fish held above 38°F, intact raw eggs held above 45°F, or reduced oxygen packaged (ROP) TCS foods held above the required temperatures, except during active necessary preparation.
This finding underscores the importance of proper temperature management in ensuring food safety and reducing the risk of foodborne illnesses.

violation_critical <- cleaned_data %>%
  filter(critical_flag == "Critical") %>% # 19,953 rows in total
  group_by(violation_description) %>%
  summarize(num_violation = n(),
            violation_percent = num_violation / 19953,  
            violation_percentage = percent(violation_percent, accuracy = 0.01)) %>%
  mutate(
    violation_description = if_else(violation_percent < 0.03, "Other", violation_description)  # Reassign small categories to "Other"
  ) %>%
  arrange(desc(violation_percent))

critical_total <- violation_critical %>%
  group_by(violation_description) %>%
  summarize(
            violation_percent = sum(violation_percent),  
            violation_percentage = percent(violation_percent, accuracy = 0.01)) #combine percentage of "other"


# Create a pie chart
plot_ly(
  data = critical_total,
  labels = ~violation_description,  # Labels for slices
  values = ~violation_percent,  # Proportions for slices
  type = 'pie',  # Pie chart type
  hoverinfo = 'label+value+percent'  # Display information on hover
) %>%
  layout(title = "Violation Description Percentages",
         legend = list( x = 1.2, y = 0.5 ))